Akshay Kapre, Dharmit Dalvi - Term project
The dataset is taken from Kaggle datasets, the link for which is as follows: https://www.kaggle.com/asaumya/healthcare-dataset-stroke-data#train_2v.csv
The dataset contains approximately 44,000 rows, and 12 columns. Each row contains data for a single patient. The columns include attributes for patients such as their age, gender, BMI, smoking status, etc, with which we can predict the final attribute: “stroke”, which predicts if the patient might suffer from a stroke or no.
The objective of this project is to gain insights on patients’ health through analysis and visualization.
We read the dataset, our health care stroke data set and replace all the mising values in one of our columns- smoking status.
data <- as.data.frame(read.csv("stroke_data.csv", header=T, sep=",", na.strings = c("", "NA")))
attach(data)
data$smoking_status <- as.character(data$smoking_status)
data$smoking_status <- ifelse(is.na(data$smoking_status),
'No information available', data$smoking_status)
The frequency for genders and a barplot for the same is calculated as follows: We have further calculated the same for only those patients who have stroke. (stroke = 1)
#categorical 1
table(data$gender)
##
## Female Male Other
## 25665 17724 11
barplot(table(data$gender), col = "cyan", ylim = c(0,25000), las = 2, xlab = "Gender", ylab = "Stroke")
yes_stroke <- data[data$stroke == 1, ]
table(yes_stroke$gender)
##
## Female Male Other
## 431 352 0
barplot(table(yes_stroke$gender), col = "cyan", ylim = c(0,500), las = 2, xlab = "Gender", ylab = "Stroke")
We can infer that the data has maximum number of females, followed by males and others.
We have further analysed the smoking status attribute, which is a categorical variable and visualized using a pie chart:
#categorical 2
smoking_status <- table(data$smoking_status)
slice.labels <- names(smoking_status)
slice.percents <- round(smoking_status/sum(smoking_status)*100)
slice.labels <- paste(slice.labels, slice.percents)
slice.labels <- paste(slice.labels, "%", sep = "")
pie(smoking_status, labels = slice.labels, col = hcl(c(0, 60, 120)))
Percentage of people who have never smoked is the highest (37 %).
We further plot a boxplot for the age attribute and perform a five number summary:
fivenum(data$age)
## [1] 0.08 24.00 44.00 60.00 82.00
boxplot(data$age, horizontal = TRUE, xaxt = "n", xlab = "Age of patients", col=hcl(1))
axis(side = 1, at=fivenum(data$age), labels = TRUE)
f <- fivenum(data$age)
f
## [1] 0.08 24.00 44.00 60.00 82.00
Next, we plot a ggplot for age, with respect to stroke values (1 or 0): We can see that for age = 80, the count for patients having stroke is maximum.
library(ggplot2)
ggplot(data, aes(x=age)) +
geom_histogram(color="black", fill="lightblue") + facet_grid(~stroke)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We further plot a boxplot of average glucose levels in patients that have a stroke and average glucose levels in patients that don’t have stroke.
no_stroke <- data[data$stroke == 0, ]
A <- no_stroke$avg_glucose_level
B <- yes_stroke$avg_glucose_level
boxplot(A,B, xaxt = "n", xlab = "Whether patients have stroke", ylab = "Glucose levels of patients", col=c("red", "green"))
The patients that don’t have stroke have lower average of glucose levels, the ones that have stroke have a greater average. The range of glucose values is between ~80 to 200 for patients having stroke, whereas there are a lot of outliers in glucose levels’ data for patients not having stroke.
Following is the scatterplot between three variables: age, hypertension and stroke.
#2
dt=data.frame(Age=data$age,hypertension=data$hypertension,Stroke=data$stroke)
head(dt,10)
## Age hypertension Stroke
## 1 3 0 0
## 2 58 1 0
## 3 8 0 0
## 4 70 0 0
## 5 14 0 0
## 6 47 0 0
## 7 52 0 0
## 8 75 0 0
## 9 32 0 0
## 10 74 1 0
plot(dt , pch=20 , cex=1.5 , col="red")
Major inferences that we can make from the above plot are ages for patients with stroke start from 35, whereas for hypertension, majority of patients having hypertension start from the age of 15 itself.
We have further plotted a ggplot for age vs heart disease, and we have the points colored based on work_type.
#3
ggplot(data = data) +
geom_point(mapping = aes(x = age, y = heart_disease, colour = data$work_type ))
We can infer that most patients with heart disease work in private sector.
Next we have boxplots of age, bmi and average glucose levels combined in a single plotly plot:
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- plot_ly(data, x = data$age, type="box", name = 'age')
q <-add_trace(p, x = data$bmi, type="box", name = 'bmi')
w <-add_trace(q , x = data$avg_glucose_level,type = "box" , name ="glucose level" )
w
## Warning: Ignoring 1462 observations
The pmf plot and cdf plots for age are as follows:
#age
values <- data$age
tab <- table(values)
dframe <- as.data.frame(tab)
#dframe
x <- as.numeric(as.character(dframe$values))
# probability distribution is
f <- dframe$Freq / (sum(dframe$Freq))
# calculate the mean
mu <- sum(x * f)
mu
## [1] 42.21789
# variance of the distribution is
sigmaSquare <- sum((x - mu)^2 * f)
sigmaSquare
## [1] 507.1229
sigma <- sqrt(sigmaSquare)
sigma
## [1] 22.51939
plot(x, f, type = 'h', xlab = "Age of patients", ylab = "PMF", ylim = c(0, 0.04), main = "Spike plot for Age")
abline(h = 0 )
cdf <- c(0, cumsum(f))
cdfplot <- stepfun(x, cdf)
plot(cdfplot, verticals=FALSE, pch=16, main="CDF Plot for Age", xlab = "Age", ylab = "CDF")
Similar analysis of distribution for bmi is as follows:
#bmi
values <- data$bmi
tab <- table(values)
dframe <- as.data.frame(tab)
#dframe
x <- as.numeric(as.character(dframe$values))
# probability distribution is
f <- dframe$Freq / (sum(dframe$Freq))
# calculate the mean
mu <- sum(x * f)
mu
## [1] 28.60504
# variance of the distribution is
sigmaSquare <- sum((x - mu)^2 * f)
sigmaSquare
## [1] 60.37178
sigma <- sqrt(sigmaSquare)
sigma
## [1] 7.769928
plot(x, f, type = 'h', xlab = "BMI of patients", ylab = "PMF",ylim = c(0, 0.01), main = "Spike plot for BMI")
abline(h = 0 )
cdf <- c(0, cumsum(f))
cdfplot <- stepfun(x, cdf)
plot(cdfplot, verticals=FALSE, pch=16, main="CDF Plot for BMI", xlab = "BMI", ylab = "CDF")
We have applied Cental Limit Theorem on age attibute as follows:
age <-data$age
ctable <- table(age)
#ctable
mu <- mean(age)
mu
## [1] 42.21789
sigma <- sd(age)
sigma
## [1] 22.51965
dframe <- as.data.frame(ctable)
#dframe
x <- as.numeric(as.character(data$age))
#x
hist(x, probability = TRUE, xlim = c(0, 100), xlab = "age", ylab = "Density", main = "Histogram of age")
# sample size 5
samples <- 1000
sample_size <- 5
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
main = "Densities of age with sample size 5", col = "blue")
mean1 <- mean(xbar)
sd1 <- sd(xbar)
mean1
## [1] 42.10943
sd1
## [1] 10.02085
#sample size 20
samples <- 1000
sample_size <- 20
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
main = "Densities of age with sample size 20", col = "red")
mean2 <- mean(xbar)
sd2 <- sd(xbar)
mean2
## [1] 42.40183
sd2
## [1] 4.982658
#sample size 50
samples <- 1000
sample_size <- 50
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
main = "Densities of age with sample size 50", col = "green")
mean3 <- mean(xbar)
sd3 <- sd(xbar)
mean3
## [1] 42.19804
sd3
## [1] 3.223502
cat("1st distribution:\nMean =",mean1,"\nSD =",sd1)
## 1st distribution:
## Mean = 42.10943
## SD = 10.02085
cat("2nd distribution:\nMean =",mean2,"\nSD =",sd2)
## 2nd distribution:
## Mean = 42.40183
## SD = 4.982658
cat("3rd distribution:\nMean =",mean3,"\nSD =",sd3)
## 3rd distribution:
## Mean = 42.19804
## SD = 3.223502
Similarly, We have applied Cental Limit Theorem on average glucose level attibute as follows:
glucose <-data$avg_glucose_level
ctable <- table(glucose)
#ctable
mu <- mean(glucose)
mu
## [1] 104.4827
sigma <- sd(glucose)
sigma
## [1] 43.11175
dframe <- as.data.frame(ctable)
#dframe
x <- as.numeric(as.character(data$avg_glucose_level))
#x
max(data$avg_glucose_level)
## [1] 291.05
hist(x, probability = TRUE, xlim = c(0,300 ), xlab = "Average Glucose Level", ylab = "Density", main = "Histogram of Average Glucose Level")
#sample size 5
samples <- 1000
sample_size <- 5
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T,xlim = c(0,200), xlab = "Avg Glucose level",
main = "Densities of Average Glucose Level with sample size 5", col = "blue")
mean1 <- mean(xbar)
sd1 <- sd(xbar)
mean1
## [1] 105.1466
sd1
## [1] 19.01525
#sample size 20
samples <- 1000
sample_size <- 20
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T,xlim = c(0,200), xlab = "Average Glucose Level",
main = "Densities of Average Glucose Level with sample size 20", col = "red")
mean2 <- mean(xbar)
sd2 <- sd(xbar)
mean2
## [1] 104.3902
sd2
## [1] 9.338029
#sample size 50
samples <- 1000
sample_size <- 50
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T,xlim = c(0,200), xlab = "Average Glucose Level",
main = "Densities of Average Glucose Level with sample size 20", col = "green")
mean3 <- mean(xbar)
sd3 <- sd(xbar)
mean3
## [1] 104.4944
sd3
## [1] 6.242851
cat("1st distribution:\nMean =",mean1,"\nSD =",sd1)
## 1st distribution:
## Mean = 105.1466
## SD = 19.01525
cat("2nd distribution:\nMean =",mean2,"\nSD =",sd2)
## 2nd distribution:
## Mean = 104.3902
## SD = 9.338029
cat("3rd distribution:\nMean =",mean3,"\nSD =",sd3)
## 3rd distribution:
## Mean = 104.4944
## SD = 6.242851
In both cases, our mean remains almost the same, but as sample size increases, standard deviation decreases, proving the applicability of Central Limit Theorem on both of our attributes.
We further performed sampling on age attributes of our data:
#sampling
library(sampling)
table(data$age)
##
## 0.08 0.16 0.24 0.32 0.4 0.48 0.56 0.64 0.72 0.8 0.88 1 1.08 1.16 1.24
## 17 26 50 53 35 37 47 58 66 61 46 34 62 48 44
## 1.32 1.4 1.48 1.56 1.64 1.72 1.8 1.88 2 3 4 5 6 7 8
## 52 55 45 63 60 57 62 47 479 402 356 533 246 355 436
## 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## 353 313 330 398 419 443 452 426 475 498 472 504 523 503 508
## 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
## 506 491 503 558 540 564 525 592 533 504 540 501 535 564 617
## 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 561 574 592 580 585 671 666 649 684 652 668 694 738 721 701
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## 658 671 655 690 623 618 616 612 598 589 553 529 537 459 465
## 69 70 71 72 73 74 75 76 77 78 79 80 81 82
## 441 446 415 384 352 338 357 336 344 698 626 543 454 435
hist(data$age)
mean_without_sampling <- mean(data$age)
sd(data$age)
## [1] 22.51965
Following are various sampling methods for sample size 500:
#sample size = 500
# srswor
sample.size <- 500
s <- srswor(sample.size,nrow(data))
sample.1 <- data[s != 0, ]
mean_srswor <- mean(sample.1$age)
#srswr
set.seed(153)
s <- srswr(sample.size, nrow(data))
sample.2 <- data[s != 0, ]
mean_srswr <- mean(sample.2$age)
#Systematic Sampling
N <- nrow(data)
n <- 1000
k <- ceiling(N / n)
k
## [1] 44
r <- sample(k, 1)
r
## [1] 22
s <- seq(r, by = k, length = n)
#s
sample.3 <- data[s, ]
table(sample.3$age)
##
## 0.08 0.24 0.32 0.4 0.72 0.8 0.88 1 1.08 1.32 1.4 1.56 1.64 1.72 1.8
## 1 3 2 1 1 1 1 1 2 3 1 1 1 1 3
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 9 9 9 16 7 8 11 3 4 6 10 7 8 6 5
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 10 10 11 10 14 17 9 11 9 15 9 11 12 13 15
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 12 8 10 9 12 14 15 15 16 17 14 13 19 15 8
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
## 19 12 19 14 18 17 16 14 13 12 18 15 14 13 16
## 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
## 17 16 8 8 14 10 7 11 13 9 5 8 9 9 10
## 77 78 79 80 81 82
## 10 13 19 24 12 10
mean_systematic <- mean(sample.3$age)
#Systematic Sampling with unequal probabilities
pik <- inclusionprobabilities(data$age, sample.size)
s <- UPsystematic(pik)
sample.4 <- data[s != 0, ]
table(sample.4$age)
##
## 6 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 1 3 1 1 1 1 2 1 4 2 2 2 4 3 4 3 5 4 6 2 1 7 3 8 4
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
## 4 5 8 4 5 10 4 2 9 3 4 6 5 10 12 5 4 11 7 11 11 7 12 12 4
## 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
## 12 11 15 5 8 14 6 9 12 5 8 11 5 6 10 9 8 6 6 6 6 18 15 16 10
## 82
## 13
data["age_range"] = NA
data$age_range <- cut(data$age, breaks = c(0, 25, 50, 75, Inf), labels = c('A', 'B', 'C', 'D'))
data_age <- data.frame(
age = data$age,
age_range = data$age_range
)
freq <- table(data_age$age_range)
freq
##
## A B C D
## 11546 14652 13766 3436
set.seed(123)
head(data_age)
## age age_range
## 1 3 A
## 2 58 C
## 3 8 A
## 4 70 C
## 5 14 A
## 6 47 B
st.sizes <- sample.size * freq / sum(freq)
st.1 <- strata(data_age, stratanames = c("age_range"),
size = st.sizes, method = c("srswor"),
description = TRUE)
## Stratum 1
##
## Population total and number of selected units: 11546 133.0184
## Stratum 2
##
## Population total and number of selected units: 13766 168.8018
## Stratum 3
##
## Population total and number of selected units: 14652 158.5945
## Stratum 4
##
## Population total and number of selected units: 3436 39.58525
## Number of strata 4
## Total number of selected units 500
#st.1
st.sample1 <- getdata(data, st.1)
#st.sample1
# cluster sampling
cl <- cluster(data, c("age"), size = 4, method = "srswor")
sample.6 <- getdata(data, cl)
table(sample.6$age)
##
## 15 16 50 60
## 452 426 694 616
mean_cluster <- mean(sample.6$age)
For sample size 500, the means through all sampling methods’ samples are as follows:
#sample size 500
mean_without_sampling
## [1] 42.21789
mean_srswor
## [1] 42.27424
mean_srswr
## [1] 42.98635
mean_systematic
## [1] NA
mean_cluster
## [1] 38.96527
Following are various sampling methods for sample size 1000:
#sample size = 1000
# srswor
sample.size <- 1000
s <- srswor(sample.size,nrow(data))
sample.1 <- data[s != 0, ]
mean_srswor <- mean(sample.1$age)
#srswr
set.seed(153)
s <- srswr(sample.size, nrow(data))
sample.2 <- data[s != 0, ]
mean_srswr <- mean(sample.2$age)
#Systematic Sampling
N <- nrow(data)
n <- 1000
k <- ceiling(N / n)
k
## [1] 44
r <- sample(k, 1)
r
## [1] 19
s <- seq(r, by = k, length = n)
s
## [1] 19 63 107 151 195 239 283 327 371 415 459
## [12] 503 547 591 635 679 723 767 811 855 899 943
## [23] 987 1031 1075 1119 1163 1207 1251 1295 1339 1383 1427
## [34] 1471 1515 1559 1603 1647 1691 1735 1779 1823 1867 1911
## [45] 1955 1999 2043 2087 2131 2175 2219 2263 2307 2351 2395
## [56] 2439 2483 2527 2571 2615 2659 2703 2747 2791 2835 2879
## [67] 2923 2967 3011 3055 3099 3143 3187 3231 3275 3319 3363
## [78] 3407 3451 3495 3539 3583 3627 3671 3715 3759 3803 3847
## [89] 3891 3935 3979 4023 4067 4111 4155 4199 4243 4287 4331
## [100] 4375 4419 4463 4507 4551 4595 4639 4683 4727 4771 4815
## [111] 4859 4903 4947 4991 5035 5079 5123 5167 5211 5255 5299
## [122] 5343 5387 5431 5475 5519 5563 5607 5651 5695 5739 5783
## [133] 5827 5871 5915 5959 6003 6047 6091 6135 6179 6223 6267
## [144] 6311 6355 6399 6443 6487 6531 6575 6619 6663 6707 6751
## [155] 6795 6839 6883 6927 6971 7015 7059 7103 7147 7191 7235
## [166] 7279 7323 7367 7411 7455 7499 7543 7587 7631 7675 7719
## [177] 7763 7807 7851 7895 7939 7983 8027 8071 8115 8159 8203
## [188] 8247 8291 8335 8379 8423 8467 8511 8555 8599 8643 8687
## [199] 8731 8775 8819 8863 8907 8951 8995 9039 9083 9127 9171
## [210] 9215 9259 9303 9347 9391 9435 9479 9523 9567 9611 9655
## [221] 9699 9743 9787 9831 9875 9919 9963 10007 10051 10095 10139
## [232] 10183 10227 10271 10315 10359 10403 10447 10491 10535 10579 10623
## [243] 10667 10711 10755 10799 10843 10887 10931 10975 11019 11063 11107
## [254] 11151 11195 11239 11283 11327 11371 11415 11459 11503 11547 11591
## [265] 11635 11679 11723 11767 11811 11855 11899 11943 11987 12031 12075
## [276] 12119 12163 12207 12251 12295 12339 12383 12427 12471 12515 12559
## [287] 12603 12647 12691 12735 12779 12823 12867 12911 12955 12999 13043
## [298] 13087 13131 13175 13219 13263 13307 13351 13395 13439 13483 13527
## [309] 13571 13615 13659 13703 13747 13791 13835 13879 13923 13967 14011
## [320] 14055 14099 14143 14187 14231 14275 14319 14363 14407 14451 14495
## [331] 14539 14583 14627 14671 14715 14759 14803 14847 14891 14935 14979
## [342] 15023 15067 15111 15155 15199 15243 15287 15331 15375 15419 15463
## [353] 15507 15551 15595 15639 15683 15727 15771 15815 15859 15903 15947
## [364] 15991 16035 16079 16123 16167 16211 16255 16299 16343 16387 16431
## [375] 16475 16519 16563 16607 16651 16695 16739 16783 16827 16871 16915
## [386] 16959 17003 17047 17091 17135 17179 17223 17267 17311 17355 17399
## [397] 17443 17487 17531 17575 17619 17663 17707 17751 17795 17839 17883
## [408] 17927 17971 18015 18059 18103 18147 18191 18235 18279 18323 18367
## [419] 18411 18455 18499 18543 18587 18631 18675 18719 18763 18807 18851
## [430] 18895 18939 18983 19027 19071 19115 19159 19203 19247 19291 19335
## [441] 19379 19423 19467 19511 19555 19599 19643 19687 19731 19775 19819
## [452] 19863 19907 19951 19995 20039 20083 20127 20171 20215 20259 20303
## [463] 20347 20391 20435 20479 20523 20567 20611 20655 20699 20743 20787
## [474] 20831 20875 20919 20963 21007 21051 21095 21139 21183 21227 21271
## [485] 21315 21359 21403 21447 21491 21535 21579 21623 21667 21711 21755
## [496] 21799 21843 21887 21931 21975 22019 22063 22107 22151 22195 22239
## [507] 22283 22327 22371 22415 22459 22503 22547 22591 22635 22679 22723
## [518] 22767 22811 22855 22899 22943 22987 23031 23075 23119 23163 23207
## [529] 23251 23295 23339 23383 23427 23471 23515 23559 23603 23647 23691
## [540] 23735 23779 23823 23867 23911 23955 23999 24043 24087 24131 24175
## [551] 24219 24263 24307 24351 24395 24439 24483 24527 24571 24615 24659
## [562] 24703 24747 24791 24835 24879 24923 24967 25011 25055 25099 25143
## [573] 25187 25231 25275 25319 25363 25407 25451 25495 25539 25583 25627
## [584] 25671 25715 25759 25803 25847 25891 25935 25979 26023 26067 26111
## [595] 26155 26199 26243 26287 26331 26375 26419 26463 26507 26551 26595
## [606] 26639 26683 26727 26771 26815 26859 26903 26947 26991 27035 27079
## [617] 27123 27167 27211 27255 27299 27343 27387 27431 27475 27519 27563
## [628] 27607 27651 27695 27739 27783 27827 27871 27915 27959 28003 28047
## [639] 28091 28135 28179 28223 28267 28311 28355 28399 28443 28487 28531
## [650] 28575 28619 28663 28707 28751 28795 28839 28883 28927 28971 29015
## [661] 29059 29103 29147 29191 29235 29279 29323 29367 29411 29455 29499
## [672] 29543 29587 29631 29675 29719 29763 29807 29851 29895 29939 29983
## [683] 30027 30071 30115 30159 30203 30247 30291 30335 30379 30423 30467
## [694] 30511 30555 30599 30643 30687 30731 30775 30819 30863 30907 30951
## [705] 30995 31039 31083 31127 31171 31215 31259 31303 31347 31391 31435
## [716] 31479 31523 31567 31611 31655 31699 31743 31787 31831 31875 31919
## [727] 31963 32007 32051 32095 32139 32183 32227 32271 32315 32359 32403
## [738] 32447 32491 32535 32579 32623 32667 32711 32755 32799 32843 32887
## [749] 32931 32975 33019 33063 33107 33151 33195 33239 33283 33327 33371
## [760] 33415 33459 33503 33547 33591 33635 33679 33723 33767 33811 33855
## [771] 33899 33943 33987 34031 34075 34119 34163 34207 34251 34295 34339
## [782] 34383 34427 34471 34515 34559 34603 34647 34691 34735 34779 34823
## [793] 34867 34911 34955 34999 35043 35087 35131 35175 35219 35263 35307
## [804] 35351 35395 35439 35483 35527 35571 35615 35659 35703 35747 35791
## [815] 35835 35879 35923 35967 36011 36055 36099 36143 36187 36231 36275
## [826] 36319 36363 36407 36451 36495 36539 36583 36627 36671 36715 36759
## [837] 36803 36847 36891 36935 36979 37023 37067 37111 37155 37199 37243
## [848] 37287 37331 37375 37419 37463 37507 37551 37595 37639 37683 37727
## [859] 37771 37815 37859 37903 37947 37991 38035 38079 38123 38167 38211
## [870] 38255 38299 38343 38387 38431 38475 38519 38563 38607 38651 38695
## [881] 38739 38783 38827 38871 38915 38959 39003 39047 39091 39135 39179
## [892] 39223 39267 39311 39355 39399 39443 39487 39531 39575 39619 39663
## [903] 39707 39751 39795 39839 39883 39927 39971 40015 40059 40103 40147
## [914] 40191 40235 40279 40323 40367 40411 40455 40499 40543 40587 40631
## [925] 40675 40719 40763 40807 40851 40895 40939 40983 41027 41071 41115
## [936] 41159 41203 41247 41291 41335 41379 41423 41467 41511 41555 41599
## [947] 41643 41687 41731 41775 41819 41863 41907 41951 41995 42039 42083
## [958] 42127 42171 42215 42259 42303 42347 42391 42435 42479 42523 42567
## [969] 42611 42655 42699 42743 42787 42831 42875 42919 42963 43007 43051
## [980] 43095 43139 43183 43227 43271 43315 43359 43403 43447 43491 43535
## [991] 43579 43623 43667 43711 43755 43799 43843 43887 43931 43975
sample.3 <- data[s, ]
table(sample.3$age)
##
## 0.08 0.32 0.4 0.48 0.56 0.64 0.8 0.88 1 1.24 1.32 1.4 1.48 1.64 1.72
## 1 1 1 2 1 1 1 1 1 1 1 3 1 1 2
## 1.8 1.88 2 3 4 5 6 7 8 9 10 11 12 13 14
## 2 2 17 9 8 7 6 13 11 10 11 7 11 5 12
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 7 5 9 16 12 14 14 13 13 12 10 15 14 10 18
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 20 13 13 9 13 15 9 10 17 8 9 19 14 14 16
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 14 14 8 16 6 14 9 18 18 19 21 12 14 16 16
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 23 9 10 15 7 18 9 6 18 9 12 7 7 9 6
## 75 76 77 78 79 80 81 82
## 9 6 9 15 12 7 8 9
mean_systematic <- mean(sample.3$age)
#Systematic Sampling with unequal probabilities
pik <- inclusionprobabilities(data$age, sample.size)
s <- UPsystematic(pik)
sample.4 <- data[s != 0, ]
table(sample.4$age)
##
## 4 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 2 3 1 2 3 1 6 3 1 4 4 3 5 4 3 4 8 5 2 3 5 8 9 4 4
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 11 10 10 8 8 17 15 15 12 15 16 9 14 9 22 15 15 19 18 19 15 28 22 25 18
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
## 20 20 20 21 24 18 18 17 27 15 18 17 8 14 11 22 23 16 18 15 14 12 17 27 35
## 80 81 82
## 27 11 13
# Stratified sampling
freq <- table(data_age$age_range)
freq
##
## A B C D
## 11546 14652 13766 3436
set.seed(123)
head(data_age)
## age age_range
## 1 3 A
## 2 58 C
## 3 8 A
## 4 70 C
## 5 14 A
## 6 47 B
st.sizes <- sample.size * freq / sum(freq)
st.1 <- strata(data_age, stratanames = c("age_range"),
size = st.sizes, method = c("srswor"),
description = TRUE)
## Stratum 1
##
## Population total and number of selected units: 11546 266.0369
## Stratum 2
##
## Population total and number of selected units: 13766 337.6037
## Stratum 3
##
## Population total and number of selected units: 14652 317.1889
## Stratum 4
##
## Population total and number of selected units: 3436 79.17051
## Number of strata 4
## Total number of selected units 1000
#st.1
st.sample1 <- getdata(data, st.1)
#st.sample1
# cluster sampling
cl <- cluster(data, c("age"), size = 4, method = "srswor")
sample.6 <- getdata(data, cl)
table(sample.6$age)
##
## 1 1.4 7 39
## 34 55 355 561
mean_cluster <- mean(sample.6$age)
Following are the mean values for all of our sampling methods:
#sample size 1000
mean_without_sampling
## [1] 42.21789
mean_srswor
## [1] 42.91236
mean_srswr
## [1] 42.90006
mean_systematic
## [1] NA
mean_cluster
## [1] 24.35323
Following is an R code for finding if a patient with an input ID has a stroke or no. (Please refer to the .R file for code to dynamically insert any id as input)
id <- 35327
id_stroke <- data[data$id == id, ]
if(id_stroke$stroke == 0){
print(paste("The patient with ID", id, "does not have a stroke"))
} else{
print(paste("The patient with ID", id, "has a stroke"))
}
## [1] "The patient with ID 35327 does not have a stroke"
Following is the word cloud for work type:
# Word cloud for work type
# Install
#install.packages("tm") # for text mining
#install.packages("wordcloud") # word-cloud generator
#install.packages("RColorBrewer") # color palettes
# Load
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
yes_stroke <- data[data$stroke == 1, ]
table(yes_stroke$work_type)
##
## children Govt_job Never_worked Private Self-employed
## 2 89 0 441 251
workCorpus <- Corpus(VectorSource(yes_stroke$work_type))
workCorpus <- tm_map(workCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(workCorpus, content_transformer(tolower)):
## transformation drops documents
wordcloud(workCorpus, max.words = 100, random.order = FALSE)